





<!DOCTYPE html>
<html class="writer-html5" lang="zh-CN" >
<head>
  <meta charset="utf-8">

  <meta name="viewport" content="width=device-width, initial-scale=1.0">

  <title>Introduction to TOPI &mdash; tvm 0.8.dev1982 文档</title>

  <!-- Stylesheets: each sheet is linked exactly once. The order below is the
       order of the last occurrence of each sheet in the previous markup
       (theme, gallery, pygments, tlcpack_theme), so the cascade is unchanged. -->
  <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
  <link rel="stylesheet" href="../_static/css/theme.css" type="text/css" />
  <link rel="stylesheet" href="../_static/gallery.css" type="text/css" />
  <link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
  <link rel="stylesheet" href="../_static/css/tlcpack_theme.css" type="text/css" />

    <link rel="shortcut icon" href="../_static/tvm-logo-square.png"/>

    <!-- Scripts: documentation_options.js is loaded once; its id must be
         unique in the document, so the duplicate tag was removed. -->
    <script type="text/javascript" id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script>
    <script src="../_static/jquery.js"></script>
    <script src="../_static/underscore.js"></script>
    <script src="../_static/doctools.js"></script>
    <script src="../_static/translations.js"></script>

    <script type="text/javascript" src="../_static/js/theme.js"></script>

    <script type="text/javascript" src="../_static/js/tlcpack_theme.js"></script>

    <!-- Document relations: index, search, and next/previous pages. -->
    <link rel="index" title="索引" href="../genindex.html" />
    <link rel="search" title="搜索" href="../search.html" />
    <link rel="next" title="How To Guides" href="../how_to/index.html" />
    <link rel="prev" title="编译深度学习模型的快速开始教程" href="relay_quick_start.html" />
</head>

<body class="wy-body-for-nav">

   
  <div class="wy-grid-for-nav">
    
    
<header class="header">
    <!-- Site header: logo, top-level site links, and the ASF link list.
         The ASF list appears twice (in .responsivetlcdropdown and in
         .tlcDropdown); presumably one copy per layout, so keep them in sync.
         All attribute values are quoted so URLs ending in "/" cannot be
         misread by the parser. -->
    <div class="innercontainer">
      <div class="headerInner d-flex justify-content-between align-items-center">
          <div class="headerLogo">
               <a href="https://tvm.apache.org/"><img src="https://tvm.apache.org/assets/images/logo.svg" alt="logo"></a>
          </div>

          <div id="headMenu" class="headerNav">
            <button type="button" id="closeHeadMenu" class="navCloseBtn"><img src="../_static/img/close-icon.svg" alt="Close"></button>
             <ul class="nav">
                <li class="nav-item">
                   <a class="nav-link" href="https://tvm.apache.org/community">Community</a>
                </li>
                <li class="nav-item">
                   <a class="nav-link" href="https://tvm.apache.org/download">Download</a>
                </li>
                <li class="nav-item">
                   <a class="nav-link" href="https://tvm.apache.org/vta">VTA</a>
                </li>
                <li class="nav-item">
                   <a class="nav-link" href="https://tvm.apache.org/blog">Blog</a>
                </li>
                <li class="nav-item">
                   <a class="nav-link" href="https://tvm.apache.org/docs">Docs</a>
                </li>
                <li class="nav-item">
                   <a class="nav-link" href="https://tvmconf.org">Conference</a>
                </li>
                <li class="nav-item">
                   <a class="nav-link" href="https://github.com/apache/tvm/">Github</a>
                </li>
                <li class="nav-item">
                   <a class="nav-link" href="https://tvmchinese.github.io/declaration_zh_CN.html">About-Translators</a>
                </li>
             </ul>
               <div class="responsivetlcdropdown">
                 <button type="button" class="btn-link">
                   ASF
                 </button>
                 <ul>
                     <li>
                       <a href="https://apache.org/">Apache Homepage</a>
                     </li>
                     <li>
                       <a href="https://www.apache.org/licenses/">License</a>
                     </li>
                     <li>
                       <a href="https://www.apache.org/foundation/sponsorship.html">Sponsorship</a>
                     </li>
                     <li>
                       <a href="https://www.apache.org/security/">Security</a>
                     </li>
                     <li>
                       <a href="https://www.apache.org/foundation/thanks.html">Thanks</a>
                     </li>
                     <li>
                       <a href="https://www.apache.org/events/current-event">Events</a>
                     </li>
                     <li>
                       <a href="https://www.zhihu.com/column/c_1429578595417563136">Zhihu</a>
                     </li>
                 </ul>
               </div>
          </div>
            <div class="responsiveMenuIcon">
              <button type="button" id="menuBtn" class="btn-menu"><img src="../_static/img/menu-icon.svg" alt="Menu Icon"></button>
            </div>

            <div class="tlcDropdown">
              <div class="dropdown">
                <button type="button" class="btn-link dropdown-toggle" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">
                  ASF
                </button>
                <div class="dropdown-menu dropdown-menu-right">
                  <ul>
                     <li>
                       <a href="https://apache.org/">Apache Homepage</a>
                     </li>
                     <li>
                       <a href="https://www.apache.org/licenses/">License</a>
                     </li>
                     <li>
                       <a href="https://www.apache.org/foundation/sponsorship.html">Sponsorship</a>
                     </li>
                     <li>
                       <a href="https://www.apache.org/security/">Security</a>
                     </li>
                     <li>
                       <a href="https://www.apache.org/foundation/thanks.html">Thanks</a>
                     </li>
                     <li>
                       <a href="https://www.apache.org/events/current-event">Events</a>
                     </li>
                     <li>
                       <a href="https://www.zhihu.com/column/c_1429578595417563136">Zhihu</a>
                     </li>
                  </ul>
                </div>
              </div>
          </div>
       </div>
    </div>
 </header>
 
    <nav data-toggle="wy-nav-shift" class="wy-nav-side fixed">
      <div class="wy-side-scroll">
        <div class="wy-side-nav-search" >
          

          
            <a href="../index.html">
          

          
            
            <img src="../_static/tvm-logo-small.png" class="logo" alt="Logo"/>
          
          </a>

          
            
            
                <div class="version">
                  0.8.dev1982
                </div>
            
          

          
<div role="search">
  <!-- Documentation search: submits the query as "q" to ../search.html via GET. -->
  <form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
    <!-- A placeholder is not an accessible name, so the field is labelled explicitly. -->
    <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>

          
        </div>

        
        <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
          <!-- Sidebar table of contents. Each caption paragraph carries
               role="heading"; that role requires aria-level, so it is set
               explicitly to 2, the value ARIA specifies as the fallback. -->
              <p class="caption" role="heading" aria-level="2"><span class="caption-text">如何开始</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../install/index.html">安装 TVM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../contribute/index.html">贡献者指南</a></li>
</ul>
<p class="caption" role="heading" aria-level="2"><span class="caption-text">用户引导</span></p>
<ul class="current">
<li class="toctree-l1 current"><a class="reference internal" href="index.html">User Tutorial</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="introduction.html">介绍</a></li>
<li class="toctree-l2"><a class="reference internal" href="introduction.html#an-overview-of-tvm-and-model-optimization">TVM和模型优化的概述</a></li>
<li class="toctree-l2"><a class="reference internal" href="install.html">安装 TVM</a></li>
<li class="toctree-l2"><a class="reference internal" href="tvmc_command_line_driver.html">使用TVMC编译和优化一个模型</a></li>
<li class="toctree-l2"><a class="reference internal" href="autotvm_relay_x86.html">Compiling and Optimizing a Model with the Python Interface (AutoTVM)</a></li>
<li class="toctree-l2"><a class="reference internal" href="tensor_expr_get_started.html">使用张量表达式来处理运算符</a></li>
<li class="toctree-l2"><a class="reference internal" href="autotvm_matmul_x86.html">Optimizing Operators with Schedule Templates and AutoTVM</a></li>
<li class="toctree-l2"><a class="reference internal" href="auto_scheduler_matmul_x86.html">Optimizing Operators with Auto-scheduling</a></li>
<li class="toctree-l2"><a class="reference internal" href="cross_compilation_and_rpc.html">Cross Compilation and RPC</a></li>
<li class="toctree-l2"><a class="reference internal" href="relay_quick_start.html">编译深度学习模型的快速开始教程</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">Introduction to TOPI</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#basic-example">Basic example</a></li>
<li class="toctree-l3"><a class="reference internal" href="#numpy-style-operator-overloading">Numpy-style operator overloading</a></li>
<li class="toctree-l3"><a class="reference internal" href="#generic-schedules-and-fusing-operations">Generic schedules and fusing operations</a></li>
<li class="toctree-l3"><a class="reference internal" href="#fusing-convolutions">Fusing convolutions</a></li>
<li class="toctree-l3"><a class="reference internal" href="#summary">总结</a></li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../how_to/index.html">How To Guides</a></li>
</ul>
<p class="caption" role="heading" aria-level="2"><span class="caption-text">开发者引导</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../dev/tutorial/index.html">Developer Tutorial</a></li>
<li class="toctree-l1"><a class="reference internal" href="../dev/how_to/how_to.html">开发者指南</a></li>
</ul>
<p class="caption" role="heading" aria-level="2"><span class="caption-text">架构指南</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../arch/index.html">Design and Architecture</a></li>
</ul>
<p class="caption" role="heading" aria-level="2"><span class="caption-text">主题引导</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../topic/microtvm/index.html">microTVM：裸机使用TVM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../topic/vta/index.html">VTA: Versatile Tensor Accelerator</a></li>
</ul>
<p class="caption" role="heading" aria-level="2"><span class="caption-text">参考指南</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../reference/langref/index.html">语言参考</a></li>
<li class="toctree-l1"><a class="reference internal" href="../reference/api/python/index.html">Python API</a></li>
<li class="toctree-l1"><a class="reference internal" href="../reference/api/links.html">Other APIs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../reference/publications.html">Publications</a></li>
<li class="toctree-l1"><a class="reference internal" href="../genindex.html">索引</a></li>
</ul>

        </div>
        
      </div>
    </nav>

    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
      
      <nav class="wy-nav-top" aria-label="top navigation" data-toggle="wy-nav-top">
        
            <div class="togglemenu">

            </div>
            <div class="nav-content">
              <!-- tvm -->
              Table of contents
            </div>
        
      </nav>


      <div class="wy-nav-content">
        
        <div class="rst-content">
        

          




















<div role="navigation" aria-label="breadcrumbs navigation">
  <!-- Breadcrumb trail for this page, followed by a link to the page's
       reStructuredText source. Separators are written as &gt; so the literal
       ">" cannot be mistaken for markup. -->

  <ul class="wy-breadcrumbs">

      <li><a href="../index.html">Docs</a> <span class="br-arrow">&gt;</span></li>

          <li><a href="index.html">User Tutorial</a> <span class="br-arrow">&gt;</span></li>

      <li>Introduction to TOPI</li>

      <li class="wy-breadcrumbs-aside">

            <!-- The image is the link's only content, so its alt text is the link's accessible name. -->
            <a href="../_sources/tutorial/intro_topi.rst.txt" rel="nofollow"> <img src="../_static/img/source.svg" alt="View page source"/></a>

      </li>

  </ul>


  <hr/>
</div>
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">
            
  <div class="sphx-glr-download-link-note admonition note">
<p class="admonition-title">注解</p>
<p>点击 <a class="reference internal" href="#sphx-glr-download-tutorial-intro-topi-py"><span class="std std-ref">此处</span></a> 获取完整示例代码</p>
</div>
<div class="sphx-glr-example-title section" id="introduction-to-topi">
<span id="sphx-glr-tutorial-intro-topi-py"></span><h1>Introduction to TOPI<a class="headerlink" href="#introduction-to-topi" title="永久链接至标题">¶</a></h1>
<p><strong>作者</strong>: <a class="reference external" href="https://github.com/ehsanmok">Ehsan M. Kermani</a></p>
<p>This is an introductory tutorial to TVM Operator Inventory (TOPI).
TOPI provides numpy-style generic operations and schedules with higher abstractions than TVM.
In this tutorial, we will see how TOPI can save us from writing boilerplate code in TVM.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">__future__</span> <span class="k">import</span> <span class="n">absolute_import</span><span class="p">,</span> <span class="n">print_function</span>

<span class="kn">import</span> <span class="nn">tvm</span>
<span class="kn">import</span> <span class="nn">tvm.testing</span>
<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">te</span>
<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">topi</span>
<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
</pre></div>
</div>
<div class="section" id="basic-example">
<h2>Basic example<a class="headerlink" href="#basic-example" title="永久链接至标题">¶</a></h2>
<p>Let’s revisit the sum of rows operation (equivalent to <code class="code docutils literal notranslate"><span class="pre">B</span> <span class="pre">=</span> <span class="pre">numpy.sum(A,</span> <span class="pre">axis=1)</span></code>). To compute the sum of rows of a two-dimensional TVM tensor A, we should
specify the symbolic operation as well as the schedule as follows:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">n</span> <span class="o">=</span> <span class="n">te</span><span class="o">.</span><span class="n">var</span><span class="p">(</span><span class="s2">&quot;n&quot;</span><span class="p">)</span>
<span class="n">m</span> <span class="o">=</span> <span class="n">te</span><span class="o">.</span><span class="n">var</span><span class="p">(</span><span class="s2">&quot;m&quot;</span><span class="p">)</span>
<span class="n">A</span> <span class="o">=</span> <span class="n">te</span><span class="o">.</span><span class="n">placeholder</span><span class="p">((</span><span class="n">n</span><span class="p">,</span> <span class="n">m</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s2">&quot;A&quot;</span><span class="p">)</span>
<span class="n">k</span> <span class="o">=</span> <span class="n">te</span><span class="o">.</span><span class="n">reduce_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">m</span><span class="p">),</span> <span class="s2">&quot;k&quot;</span><span class="p">)</span>
<span class="n">B</span> <span class="o">=</span> <span class="n">te</span><span class="o">.</span><span class="n">compute</span><span class="p">((</span><span class="n">n</span><span class="p">,),</span> <span class="k">lambda</span> <span class="n">i</span><span class="p">:</span> <span class="n">te</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">A</span><span class="p">[</span><span class="n">i</span><span class="p">,</span> <span class="n">k</span><span class="p">],</span> <span class="n">axis</span><span class="o">=</span><span class="n">k</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s2">&quot;B&quot;</span><span class="p">)</span>
<span class="n">s</span> <span class="o">=</span> <span class="n">te</span><span class="o">.</span><span class="n">create_schedule</span><span class="p">(</span><span class="n">B</span><span class="o">.</span><span class="n">op</span><span class="p">)</span>
</pre></div>
</div>
<p>and to examine the IR code in human readable format, we can do</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="nb">print</span><span class="p">(</span><span class="n">tvm</span><span class="o">.</span><span class="n">lower</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="p">[</span><span class="n">A</span><span class="p">],</span> <span class="n">simple_mode</span><span class="o">=</span><span class="kc">True</span><span class="p">))</span>
</pre></div>
</div>
<p class="sphx-glr-script-out">输出:</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>primfn(A_1: handle) -&gt; ()
  attr = {&quot;from_legacy_te_schedule&quot;: True, &quot;global_symbol&quot;: &quot;main&quot;, &quot;tir.noalias&quot;: True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [n: int32, m: int32], [stride: int32, stride_1: int32], type=&quot;auto&quot;)}
  buffer_map = {A_1: A} {
  allocate(B: Pointer(global float32), float32, [n]), storage_scope = global;
  for (i: int32, 0, n) {
    B[i] = 0f32
    for (k: int32, 0, m) {
      B[i] = ((float32*)B[i] + (float32*)A_2[((i*stride) + (k*stride_1))])
    }
  }
}
</pre></div>
</div>
<p>However, for such a common operation we had to define the reduce axis ourselves as well as explicit computation with
<code class="code docutils literal notranslate"><span class="pre">te.compute</span></code>. Imagine how many details we would need to provide for more complicated operations.
Fortunately, we can replace those two lines with a simple <code class="code docutils literal notranslate"><span class="pre">topi.sum</span></code>, much like <code class="code docutils literal notranslate"><span class="pre">numpy.sum</span></code>:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">C</span> <span class="o">=</span> <span class="n">topi</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">A</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="n">ts</span> <span class="o">=</span> <span class="n">te</span><span class="o">.</span><span class="n">create_schedule</span><span class="p">(</span><span class="n">C</span><span class="o">.</span><span class="n">op</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="n">tvm</span><span class="o">.</span><span class="n">lower</span><span class="p">(</span><span class="n">ts</span><span class="p">,</span> <span class="p">[</span><span class="n">A</span><span class="p">],</span> <span class="n">simple_mode</span><span class="o">=</span><span class="kc">True</span><span class="p">))</span>
</pre></div>
</div>
<p class="sphx-glr-script-out">输出:</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>primfn(A_1: handle) -&gt; ()
  attr = {&quot;from_legacy_te_schedule&quot;: True, &quot;global_symbol&quot;: &quot;main&quot;, &quot;tir.noalias&quot;: True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [n: int32, m: int32], [stride: int32, stride_1: int32], type=&quot;auto&quot;)}
  buffer_map = {A_1: A} {
  allocate(A_red: Pointer(global float32), float32, [n]), storage_scope = global;
  for (ax0: int32, 0, n) {
    A_red[ax0] = 0f32
    for (k1: int32, 0, m) {
      A_red[ax0] = ((float32*)A_red[ax0] + (float32*)A_2[((ax0*stride) + (k1*stride_1))])
    }
  }
}
</pre></div>
</div>
</div>
<div class="section" id="numpy-style-operator-overloading">
<h2>Numpy-style operator overloading<a class="headerlink" href="#numpy-style-operator-overloading" title="永久链接至标题">¶</a></h2>
<p>We can add two tensors that have compatible (broadcastable) shapes using <code class="code docutils literal notranslate"><span class="pre">topi.broadcast_add</span></code>.
Even shorter, TOPI provides operator overloading for such common operations. For example,</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">x</span><span class="p">,</span> <span class="n">y</span> <span class="o">=</span> <span class="mi">100</span><span class="p">,</span> <span class="mi">10</span>
<span class="n">a</span> <span class="o">=</span> <span class="n">te</span><span class="o">.</span><span class="n">placeholder</span><span class="p">((</span><span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">y</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s2">&quot;a&quot;</span><span class="p">)</span>
<span class="n">b</span> <span class="o">=</span> <span class="n">te</span><span class="o">.</span><span class="n">placeholder</span><span class="p">((</span><span class="n">y</span><span class="p">,</span> <span class="n">y</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s2">&quot;b&quot;</span><span class="p">)</span>
<span class="n">c</span> <span class="o">=</span> <span class="n">a</span> <span class="o">+</span> <span class="n">b</span>  <span class="c1"># same as topi.broadcast_add</span>
<span class="n">d</span> <span class="o">=</span> <span class="n">a</span> <span class="o">*</span> <span class="n">b</span>  <span class="c1"># same as topi.broadcast_mul</span>
</pre></div>
</div>
<p>Overloaded with the same syntax, TOPI handles broadcasting a primitive (<cite>int</cite>, <cite>float</cite>) to a tensor <code class="code docutils literal notranslate"><span class="pre">d</span> <span class="pre">-</span> <span class="pre">3.14</span></code>.</p>
</div>
<div class="section" id="generic-schedules-and-fusing-operations">
<h2>Generic schedules and fusing operations<a class="headerlink" href="#generic-schedules-and-fusing-operations" title="永久链接至标题">¶</a></h2>
<p>Up to now, we have seen an example of how TOPI can save us from writing explicit computations in lower level API.
But it doesn’t stop here. Still we did the scheduling as before. TOPI also provides higher level
scheduling recipes depending on a given context. For example, for CUDA,
we can schedule the following series of operations ending with <code class="code docutils literal notranslate"><span class="pre">topi.sum</span></code> using only
<code class="code docutils literal notranslate"><span class="pre">topi.cuda.schedule_reduce</span></code>:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">e</span> <span class="o">=</span> <span class="n">topi</span><span class="o">.</span><span class="n">elemwise_sum</span><span class="p">([</span><span class="n">c</span><span class="p">,</span> <span class="n">d</span><span class="p">])</span>
<span class="n">f</span> <span class="o">=</span> <span class="n">e</span> <span class="o">/</span> <span class="mf">2.0</span>
<span class="n">g</span> <span class="o">=</span> <span class="n">topi</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
<span class="k">with</span> <span class="n">tvm</span><span class="o">.</span><span class="n">target</span><span class="o">.</span><span class="n">cuda</span><span class="p">():</span>
    <span class="n">sg</span> <span class="o">=</span> <span class="n">topi</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">schedule_reduce</span><span class="p">(</span><span class="n">g</span><span class="p">)</span>
    <span class="nb">print</span><span class="p">(</span><span class="n">tvm</span><span class="o">.</span><span class="n">lower</span><span class="p">(</span><span class="n">sg</span><span class="p">,</span> <span class="p">[</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">],</span> <span class="n">simple_mode</span><span class="o">=</span><span class="kc">True</span><span class="p">))</span>
</pre></div>
</div>
<p class="sphx-glr-script-out">输出:</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>primfn(a_1: handle, b_1: handle) -&gt; ()
  attr = {&quot;from_legacy_te_schedule&quot;: True, &quot;global_symbol&quot;: &quot;main&quot;, &quot;tir.noalias&quot;: True}
  buffers = {b: Buffer(b_2: Pointer(float32), float32, [10, 10], []),
             a: Buffer(a_2: Pointer(float32), float32, [100, 10, 10], [])}
  buffer_map = {a_1: a, b_1: b} {
  allocate(T_divide_red: Pointer(global float32), float32, [1]), storage_scope = global;
  attr [IterVar(threadIdx.x: int32, [0:1024], &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 1024;
  allocate(T_divide_red.rf: Pointer(local float32), float32, [1]), storage_scope = local;
  allocate(reduce_temp0: Pointer(local float32), float32, [1]), storage_scope = local {
    T_divide_red.rf[0] = 0f32
    for (k0.k1.fused.k2.fused.outer: int32, 0, 10) {
      if @tir.likely((((((k0.k1.fused.k2.fused.outer*1024) + threadIdx.x) &lt; 10000) &amp;&amp; (((k0.k1.fused.k2.fused.outer*1024) + threadIdx.x) &lt; 10000)) &amp;&amp; (((k0.k1.fused.k2.fused.outer*1024) + threadIdx.x) &lt; 10000)), dtype=bool) {
        T_divide_red.rf[0] = ((float32*)T_divide_red.rf[0] + ((((float32*)a_2[((k0.k1.fused.k2.fused.outer*1024) + threadIdx.x)] + (float32*)b_2[floormod(((k0.k1.fused.k2.fused.outer*1024) + threadIdx.x), 100)]) + ((float32*)a_2[((k0.k1.fused.k2.fused.outer*1024) + threadIdx.x)]*(float32*)b_2[floormod(((k0.k1.fused.k2.fused.outer*1024) + threadIdx.x), 100)]))*0.5f32))
      }
    }
    attr [meta[tir.CommReducer][0]] &quot;reduce_scope&quot; = @tir.reinterpret(0u64, dtype=handle);
    @tir.tvm_thread_allreduce(1u32, (float32*)T_divide_red.rf[0], True, reduce_temp0, threadIdx.x, dtype=handle)
    if (threadIdx.x == 0) {
      T_divide_red[0] = (float32*)reduce_temp0[0]
    }
  }
}
</pre></div>
</div>
<p>As you can see, scheduled stages of computation have been accumulated and we can examine them by</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="nb">print</span><span class="p">(</span><span class="n">sg</span><span class="o">.</span><span class="n">stages</span><span class="p">)</span>
</pre></div>
</div>
<p class="sphx-glr-script-out">输出:</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>[stage(a, placeholder(a, 0xa4457d0)), stage(b, placeholder(b, 0x1473af10)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_elemwise_sum, compute(T_elemwise_sum, body=[(T_add[ax0, ax1, ax2] + T_multiply[ax0, ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=elemwise, attrs={})), stage(T_divide, compute(T_divide, body=[(T_elemwise_sum[ax0, ax1, ax2]/2f)], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=elemwise, attrs={})), stage(T_divide_red.rf, compute(T_divide_red.rf, body=[reduce(combiner=comm_reducer(result=[(x + y)], lhs=[x], rhs=[y], identity_element=[0f]), source=[T_divide[floordiv(floordiv((k0.k1.fused.k2.fused.inner + (k0.k1.fused.k2.fused.outer*1024)), 10), 10), floormod(floordiv((k0.k1.fused.k2.fused.inner + (k0.k1.fused.k2.fused.outer*1024)), 10), 10), floormod((k0.k1.fused.k2.fused.inner + (k0.k1.fused.k2.fused.outer*1024)), 10)]], init=[], axis=[iter_var(k0.k1.fused.k2.fused.outer, range(min=0, ext=10))], where=tir.likely((((floordiv(floordiv((k0.k1.fused.k2.fused.inner + (k0.k1.fused.k2.fused.outer*1024)), 10), 10) &lt; 100) &amp;&amp; (floordiv((k0.k1.fused.k2.fused.inner + (k0.k1.fused.k2.fused.outer*1024)), 10) &lt; 1000)) &amp;&amp; ((k0.k1.fused.k2.fused.inner + (k0.k1.fused.k2.fused.outer*1024)) &lt; 10000))), value_index=0)], 
axis=[iter_var(k0.k1.fused.k2.fused.inner, range(min=0, ext=1024))], reduce_axis=[iter_var(k0.k1.fused.k2.fused.outer, range(min=0, ext=10))], tag=, attrs={})), stage(T_divide_red, compute(T_divide_red.repl, body=[reduce(combiner=comm_reducer(result=[(x + y)], lhs=[x], rhs=[y], identity_element=[0f]), source=[T_divide_red.rf[k0.k1.fused.k2.fused.inner.v]], init=[], axis=[iter_var(k0.k1.fused.k2.fused.inner.v, range(min=0, ext=1024))], where=(bool)1, value_index=0)], axis=[], reduce_axis=[iter_var(k0.k1.fused.k2.fused.inner.v, range(min=0, ext=1024))], tag=, attrs={}))]
</pre></div>
</div>
<p>We can test the correctness by comparing with <code class="code docutils literal notranslate"><span class="pre">numpy</span></code> result as follows</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">func</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">build</span><span class="p">(</span><span class="n">sg</span><span class="p">,</span> <span class="p">[</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">g</span><span class="p">],</span> <span class="s2">&quot;cuda&quot;</span><span class="p">)</span>
<span class="n">dev</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">cuda</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
<span class="n">a_np</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">uniform</span><span class="p">(</span><span class="n">size</span><span class="o">=</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">y</span><span class="p">))</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">a</span><span class="o">.</span><span class="n">dtype</span><span class="p">)</span>
<span class="n">b_np</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">uniform</span><span class="p">(</span><span class="n">size</span><span class="o">=</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">y</span><span class="p">))</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">b</span><span class="o">.</span><span class="n">dtype</span><span class="p">)</span>
<span class="n">g_np</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">a_np</span> <span class="o">+</span> <span class="n">b_np</span><span class="p">,</span> <span class="n">a_np</span> <span class="o">*</span> <span class="n">b_np</span><span class="p">)</span> <span class="o">/</span> <span class="mf">2.0</span><span class="p">)</span>
<span class="n">a_nd</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">a_np</span><span class="p">,</span> <span class="n">dev</span><span class="p">)</span>
<span class="n">b_nd</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">b_np</span><span class="p">,</span> <span class="n">dev</span><span class="p">)</span>
<span class="n">g_nd</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="n">g_np</span><span class="o">.</span><span class="n">shape</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">g_np</span><span class="o">.</span><span class="n">dtype</span><span class="p">),</span> <span class="n">dev</span><span class="p">)</span>
<span class="n">func</span><span class="p">(</span><span class="n">a_nd</span><span class="p">,</span> <span class="n">b_nd</span><span class="p">,</span> <span class="n">g_nd</span><span class="p">)</span>
<span class="n">tvm</span><span class="o">.</span><span class="n">testing</span><span class="o">.</span><span class="n">assert_allclose</span><span class="p">(</span><span class="n">g_nd</span><span class="o">.</span><span class="n">numpy</span><span class="p">(),</span> <span class="n">g_np</span><span class="p">,</span> <span class="n">rtol</span><span class="o">=</span><span class="mf">1e-5</span><span class="p">)</span>
</pre></div>
</div>
<p>TOPI also provides common neural network operations such as <em>softmax</em> with an optimized schedule:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">tarray</span> <span class="o">=</span> <span class="n">te</span><span class="o">.</span><span class="n">placeholder</span><span class="p">((</span><span class="mi">512</span><span class="p">,</span> <span class="mi">512</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s2">&quot;tarray&quot;</span><span class="p">)</span>
<span class="n">softmax_topi</span> <span class="o">=</span> <span class="n">topi</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">softmax</span><span class="p">(</span><span class="n">tarray</span><span class="p">)</span>
<span class="k">with</span> <span class="n">tvm</span><span class="o">.</span><span class="n">target</span><span class="o">.</span><span class="n">Target</span><span class="p">(</span><span class="s2">&quot;cuda&quot;</span><span class="p">):</span>
    <span class="n">sst</span> <span class="o">=</span> <span class="n">topi</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">schedule_softmax</span><span class="p">(</span><span class="n">softmax_topi</span><span class="p">)</span>
    <span class="nb">print</span><span class="p">(</span><span class="n">tvm</span><span class="o">.</span><span class="n">lower</span><span class="p">(</span><span class="n">sst</span><span class="p">,</span> <span class="p">[</span><span class="n">tarray</span><span class="p">],</span> <span class="n">simple_mode</span><span class="o">=</span><span class="kc">True</span><span class="p">))</span>
</pre></div>
</div>
<p class="sphx-glr-script-out">输出:</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>primfn(tarray_1: handle) -&gt; ()
  attr = {&quot;from_legacy_te_schedule&quot;: True, &quot;global_symbol&quot;: &quot;main&quot;, &quot;tir.noalias&quot;: True}
  buffers = {tarray: Buffer(tarray_2: Pointer(float32), float32, [512, 512], [])}
  buffer_map = {tarray_1: tarray} {
  allocate(T_softmax_norm: Pointer(global float32x4), float32x4, [65536]), storage_scope = global;
  attr [IterVar(blockIdx.x: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;blockIdx.x&quot;)] &quot;thread_extent&quot; = 512;
  allocate(normal_reduce_temp0: Pointer(local float32), float32, [1]), storage_scope = local;
  allocate(reduce_temp0: Pointer(local float32), float32, [1]), storage_scope = local;
  allocate(T_softmax_exp: Pointer(warp float32), float32, [512]), storage_scope = warp;
  allocate(normal_reduce_temp0_1: Pointer(local float32), float32, [1]), storage_scope = local;
  allocate(reduce_temp0_1: Pointer(local float32), float32, [1]), storage_scope = local {
    attr [IterVar(threadIdx.x: int32, [0:32], &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 32 {
      normal_reduce_temp0[0] = -3.40282e+38f32
      for (k.inner: int32, 0, 16) {
        normal_reduce_temp0[0] = max((float32*)normal_reduce_temp0[0], (float32*)tarray_2[(((blockIdx.x*512) + (threadIdx.x*16)) + k.inner)])
      }
      attr [meta[tir.CommReducer][0]] &quot;reduce_scope&quot; = @tir.reinterpret(0u64, dtype=handle);
      @tir.tvm_thread_allreduce(1u32, (float32*)normal_reduce_temp0[0], True, reduce_temp0, threadIdx.x, dtype=handle)
      for (i1.inner.outer: int32, 0, 4) {
        T_softmax_exp[ramp(((threadIdx.x*16) + (i1.inner.outer*4)), 1, 4)] = @tir.exp(((float32x4*)tarray_2[ramp((((blockIdx.x*512) + (threadIdx.x*16)) + (i1.inner.outer*4)), 1, 4)] - broadcast((float32*)reduce_temp0[0], 4)), dtype=float32x4)
      }
    }
    attr [IterVar(threadIdx.x, [0:32], &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 32 {
      normal_reduce_temp0_1[0] = 0f32
      for (k.inner_1: int32, 0, 16) {
        normal_reduce_temp0_1[0] = ((float32*)normal_reduce_temp0_1[0] + (float32*)T_softmax_exp[((threadIdx.x*16) + k.inner_1)])
      }
      attr [meta[tir.CommReducer][1]] &quot;reduce_scope&quot; = @tir.reinterpret(0u64, dtype=handle);
      @tir.tvm_thread_allreduce(1u32, (float32*)normal_reduce_temp0_1[0], True, reduce_temp0_1, threadIdx.x, dtype=handle)
      for (i1.inner.outer_1: int32, 0, 4) {
        T_softmax_norm[ramp((((blockIdx.x*512) + (threadIdx.x*16)) + (i1.inner.outer_1*4)), 1, 4)] = ((float32x4*)T_softmax_exp[ramp(((threadIdx.x*16) + (i1.inner.outer_1*4)), 1, 4)] / broadcast((float32*)reduce_temp0_1[0], 4))
      }
    }
  }
}
</pre></div>
</div>
</div>
<div class="section" id="fusing-convolutions">
<h2>Fusing convolutions<a class="headerlink" href="#fusing-convolutions" title="永久链接至标题">¶</a></h2>
<p>We can fuse <code class="code docutils literal notranslate"><span class="pre">topi.nn.conv2d</span></code> and <code class="code docutils literal notranslate"><span class="pre">topi.nn.relu</span></code> together.</p>
<div class="admonition note">
<p class="admonition-title">注解</p>
<p>TOPI functions are all generic functions. They have different implementations
for different backends to optimize for performance.
For each backend, it is necessary to call them under a target scope for both
compute declaration and schedule. TVM will choose the right function to call with
the target information.</p>
</div>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">data</span> <span class="o">=</span> <span class="n">te</span><span class="o">.</span><span class="n">placeholder</span><span class="p">((</span><span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">224</span><span class="p">,</span> <span class="mi">224</span><span class="p">))</span>
<span class="n">kernel</span> <span class="o">=</span> <span class="n">te</span><span class="o">.</span><span class="n">placeholder</span><span class="p">((</span><span class="mi">10</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">5</span><span class="p">))</span>

<span class="k">with</span> <span class="n">tvm</span><span class="o">.</span><span class="n">target</span><span class="o">.</span><span class="n">Target</span><span class="p">(</span><span class="s2">&quot;cuda&quot;</span><span class="p">):</span>
    <span class="n">conv</span> <span class="o">=</span> <span class="n">topi</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">conv2d_nchw</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">kernel</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
    <span class="n">out</span> <span class="o">=</span> <span class="n">topi</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">relu</span><span class="p">(</span><span class="n">conv</span><span class="p">)</span>
    <span class="n">sconv</span> <span class="o">=</span> <span class="n">topi</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">schedule_conv2d_nchw</span><span class="p">([</span><span class="n">out</span><span class="p">])</span>
    <span class="nb">print</span><span class="p">(</span><span class="n">tvm</span><span class="o">.</span><span class="n">lower</span><span class="p">(</span><span class="n">sconv</span><span class="p">,</span> <span class="p">[</span><span class="n">data</span><span class="p">,</span> <span class="n">kernel</span><span class="p">],</span> <span class="n">simple_mode</span><span class="o">=</span><span class="kc">True</span><span class="p">))</span>
</pre></div>
</div>
<p class="sphx-glr-script-out">输出:</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>primfn(placeholder_2: handle, placeholder_3: handle) -&gt; ()
  attr = {&quot;from_legacy_te_schedule&quot;: True, &quot;global_symbol&quot;: &quot;main&quot;, &quot;tir.noalias&quot;: True}
  buffers = {placeholder_1: Buffer(placeholder_4: Pointer(float32), float32, [10, 3, 5, 5], []),
             placeholder: Buffer(placeholder_5: Pointer(float32), float32, [1, 3, 224, 224], [])}
  buffer_map = {placeholder_2: placeholder, placeholder_3: placeholder_1} {
  allocate(compute: Pointer(global float32), float32, [501760]), storage_scope = global;
  attr [IterVar(blockIdx.z: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;blockIdx.z&quot;)] &quot;thread_extent&quot; = 5;
  allocate(compute_1: Pointer(local float32), float32, [14]), storage_scope = local;
  allocate(pad_temp.shared: Pointer(shared float32), float32, [112]), storage_scope = shared;
  allocate(placeholder.shared: Pointer(shared float32), float32, [2]), storage_scope = shared;
  attr [IterVar(blockIdx.y: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;blockIdx.y&quot;)] &quot;thread_extent&quot; = 224;
  attr [IterVar(blockIdx.x: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;blockIdx.x&quot;)] &quot;thread_extent&quot; = 2;
  attr [IterVar(threadIdx.z: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.z&quot;)] &quot;thread_extent&quot; = 1;
  attr [IterVar(threadIdx.y: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.y&quot;)] &quot;thread_extent&quot; = 1;
  attr [IterVar(threadIdx.x: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 16 {
    compute_1[0] = 0f32
    compute_1[2] = 0f32
    compute_1[4] = 0f32
    compute_1[6] = 0f32
    compute_1[8] = 0f32
    compute_1[10] = 0f32
    compute_1[12] = 0f32
    compute_1[1] = 0f32
    compute_1[3] = 0f32
    compute_1[5] = 0f32
    compute_1[7] = 0f32
    compute_1[9] = 0f32
    compute_1[11] = 0f32
    compute_1[13] = 0f32
    for (rc.outer: int32, 0, 3) {
      for (ry.outer: int32, 0, 5) {
        attr [IterVar(threadIdx.z_1: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.z&quot;)] &quot;thread_extent&quot; = 1;
        attr [IterVar(threadIdx.y_1: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.y&quot;)] &quot;thread_extent&quot; = 1;
        attr [IterVar(threadIdx.x_1: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 16 {
          pad_temp.shared[(threadIdx.x_1*7)] = @tir.if_then_else((((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)) &amp;&amp; (2 &lt;= ((blockIdx.x*112) + (threadIdx.x_1*7)))), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 450)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 1)] = @tir.if_then_else((((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)) &amp;&amp; (1 &lt;= ((blockIdx.x*112) + (threadIdx.x_1*7)))), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 449)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 2)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 448)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 3)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 447)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 4)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 446)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 5)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 445)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 6)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 444)], 0f32, dtype=float32)
        }
        attr [IterVar(threadIdx.z_2: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.z&quot;)] &quot;thread_extent&quot; = 1;
        attr [IterVar(threadIdx.y_2: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.y&quot;)] &quot;thread_extent&quot; = 1;
        attr [IterVar(threadIdx.x_2: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 16;
        if @tir.likely((threadIdx.x_2 &lt; 2), dtype=bool) {
          placeholder.shared[threadIdx.x_2] = (float32*)placeholder_4[((((blockIdx.z*150) + (threadIdx.x_2*75)) + (rc.outer*25)) + (ry.outer*5))]
        }
        compute_1[0] = ((float32*)compute_1[0] + ((float32*)pad_temp.shared[threadIdx.x]*(float32*)placeholder.shared[0]))
        compute_1[2] = ((float32*)compute_1[2] + ((float32*)pad_temp.shared[(threadIdx.x + 16)]*(float32*)placeholder.shared[0]))
        compute_1[4] = ((float32*)compute_1[4] + ((float32*)pad_temp.shared[(threadIdx.x + 32)]*(float32*)placeholder.shared[0]))
        compute_1[6] = ((float32*)compute_1[6] + ((float32*)pad_temp.shared[(threadIdx.x + 48)]*(float32*)placeholder.shared[0]))
        compute_1[8] = ((float32*)compute_1[8] + ((float32*)pad_temp.shared[(threadIdx.x + 64)]*(float32*)placeholder.shared[0]))
        compute_1[10] = ((float32*)compute_1[10] + ((float32*)pad_temp.shared[(threadIdx.x + 80)]*(float32*)placeholder.shared[0]))
        compute_1[12] = ((float32*)compute_1[12] + ((float32*)pad_temp.shared[(threadIdx.x + 96)]*(float32*)placeholder.shared[0]))
        compute_1[1] = ((float32*)compute_1[1] + ((float32*)pad_temp.shared[threadIdx.x]*(float32*)placeholder.shared[1]))
        compute_1[3] = ((float32*)compute_1[3] + ((float32*)pad_temp.shared[(threadIdx.x + 16)]*(float32*)placeholder.shared[1]))
        compute_1[5] = ((float32*)compute_1[5] + ((float32*)pad_temp.shared[(threadIdx.x + 32)]*(float32*)placeholder.shared[1]))
        compute_1[7] = ((float32*)compute_1[7] + ((float32*)pad_temp.shared[(threadIdx.x + 48)]*(float32*)placeholder.shared[1]))
        compute_1[9] = ((float32*)compute_1[9] + ((float32*)pad_temp.shared[(threadIdx.x + 64)]*(float32*)placeholder.shared[1]))
        compute_1[11] = ((float32*)compute_1[11] + ((float32*)pad_temp.shared[(threadIdx.x + 80)]*(float32*)placeholder.shared[1]))
        compute_1[13] = ((float32*)compute_1[13] + ((float32*)pad_temp.shared[(threadIdx.x + 96)]*(float32*)placeholder.shared[1]))
        attr [IterVar(threadIdx.z_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.z&quot;)] &quot;thread_extent&quot; = 1;
        attr [IterVar(threadIdx.y_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.y&quot;)] &quot;thread_extent&quot; = 1;
        attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 16 {
          pad_temp.shared[(threadIdx.x_1*7)] = @tir.if_then_else((((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)) &amp;&amp; (1 &lt;= ((blockIdx.x*112) + (threadIdx.x_1*7)))), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 449)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 1)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 448)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 2)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 447)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 3)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 446)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 4)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 445)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 5)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 444)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 6)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 443)], 0f32, dtype=float32)
        }
        attr [IterVar(threadIdx.z_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.z&quot;)] &quot;thread_extent&quot; = 1;
        attr [IterVar(threadIdx.y_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.y&quot;)] &quot;thread_extent&quot; = 1;
        attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 16;
        if @tir.likely((threadIdx.x_2 &lt; 2), dtype=bool) {
          placeholder.shared[threadIdx.x_2] = (float32*)placeholder_4[(((((blockIdx.z*150) + (threadIdx.x_2*75)) + (rc.outer*25)) + (ry.outer*5)) + 1)]
        }
        compute_1[0] = ((float32*)compute_1[0] + ((float32*)pad_temp.shared[threadIdx.x]*(float32*)placeholder.shared[0]))
        compute_1[2] = ((float32*)compute_1[2] + ((float32*)pad_temp.shared[(threadIdx.x + 16)]*(float32*)placeholder.shared[0]))
        compute_1[4] = ((float32*)compute_1[4] + ((float32*)pad_temp.shared[(threadIdx.x + 32)]*(float32*)placeholder.shared[0]))
        compute_1[6] = ((float32*)compute_1[6] + ((float32*)pad_temp.shared[(threadIdx.x + 48)]*(float32*)placeholder.shared[0]))
        compute_1[8] = ((float32*)compute_1[8] + ((float32*)pad_temp.shared[(threadIdx.x + 64)]*(float32*)placeholder.shared[0]))
        compute_1[10] = ((float32*)compute_1[10] + ((float32*)pad_temp.shared[(threadIdx.x + 80)]*(float32*)placeholder.shared[0]))
        compute_1[12] = ((float32*)compute_1[12] + ((float32*)pad_temp.shared[(threadIdx.x + 96)]*(float32*)placeholder.shared[0]))
        compute_1[1] = ((float32*)compute_1[1] + ((float32*)pad_temp.shared[threadIdx.x]*(float32*)placeholder.shared[1]))
        compute_1[3] = ((float32*)compute_1[3] + ((float32*)pad_temp.shared[(threadIdx.x + 16)]*(float32*)placeholder.shared[1]))
        compute_1[5] = ((float32*)compute_1[5] + ((float32*)pad_temp.shared[(threadIdx.x + 32)]*(float32*)placeholder.shared[1]))
        compute_1[7] = ((float32*)compute_1[7] + ((float32*)pad_temp.shared[(threadIdx.x + 48)]*(float32*)placeholder.shared[1]))
        compute_1[9] = ((float32*)compute_1[9] + ((float32*)pad_temp.shared[(threadIdx.x + 64)]*(float32*)placeholder.shared[1]))
        compute_1[11] = ((float32*)compute_1[11] + ((float32*)pad_temp.shared[(threadIdx.x + 80)]*(float32*)placeholder.shared[1]))
        compute_1[13] = ((float32*)compute_1[13] + ((float32*)pad_temp.shared[(threadIdx.x + 96)]*(float32*)placeholder.shared[1]))
        attr [IterVar(threadIdx.z_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.z&quot;)] &quot;thread_extent&quot; = 1;
        attr [IterVar(threadIdx.y_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.y&quot;)] &quot;thread_extent&quot; = 1;
        attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 16 {
          pad_temp.shared[(threadIdx.x_1*7)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 448)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 1)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 447)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 2)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 446)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 3)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 445)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 4)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 444)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 5)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 443)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 6)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 442)], 0f32, dtype=float32)
        }
        attr [IterVar(threadIdx.z_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.z&quot;)] &quot;thread_extent&quot; = 1;
        attr [IterVar(threadIdx.y_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.y&quot;)] &quot;thread_extent&quot; = 1;
        attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 16;
        if @tir.likely((threadIdx.x_2 &lt; 2), dtype=bool) {
          placeholder.shared[threadIdx.x_2] = (float32*)placeholder_4[(((((blockIdx.z*150) + (threadIdx.x_2*75)) + (rc.outer*25)) + (ry.outer*5)) + 2)]
        }
        compute_1[0] = ((float32*)compute_1[0] + ((float32*)pad_temp.shared[threadIdx.x]*(float32*)placeholder.shared[0]))
        compute_1[2] = ((float32*)compute_1[2] + ((float32*)pad_temp.shared[(threadIdx.x + 16)]*(float32*)placeholder.shared[0]))
        compute_1[4] = ((float32*)compute_1[4] + ((float32*)pad_temp.shared[(threadIdx.x + 32)]*(float32*)placeholder.shared[0]))
        compute_1[6] = ((float32*)compute_1[6] + ((float32*)pad_temp.shared[(threadIdx.x + 48)]*(float32*)placeholder.shared[0]))
        compute_1[8] = ((float32*)compute_1[8] + ((float32*)pad_temp.shared[(threadIdx.x + 64)]*(float32*)placeholder.shared[0]))
        compute_1[10] = ((float32*)compute_1[10] + ((float32*)pad_temp.shared[(threadIdx.x + 80)]*(float32*)placeholder.shared[0]))
        compute_1[12] = ((float32*)compute_1[12] + ((float32*)pad_temp.shared[(threadIdx.x + 96)]*(float32*)placeholder.shared[0]))
        compute_1[1] = ((float32*)compute_1[1] + ((float32*)pad_temp.shared[threadIdx.x]*(float32*)placeholder.shared[1]))
        compute_1[3] = ((float32*)compute_1[3] + ((float32*)pad_temp.shared[(threadIdx.x + 16)]*(float32*)placeholder.shared[1]))
        compute_1[5] = ((float32*)compute_1[5] + ((float32*)pad_temp.shared[(threadIdx.x + 32)]*(float32*)placeholder.shared[1]))
        compute_1[7] = ((float32*)compute_1[7] + ((float32*)pad_temp.shared[(threadIdx.x + 48)]*(float32*)placeholder.shared[1]))
        compute_1[9] = ((float32*)compute_1[9] + ((float32*)pad_temp.shared[(threadIdx.x + 64)]*(float32*)placeholder.shared[1]))
        compute_1[11] = ((float32*)compute_1[11] + ((float32*)pad_temp.shared[(threadIdx.x + 80)]*(float32*)placeholder.shared[1]))
        compute_1[13] = ((float32*)compute_1[13] + ((float32*)pad_temp.shared[(threadIdx.x + 96)]*(float32*)placeholder.shared[1]))
        attr [IterVar(threadIdx.z_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.z&quot;)] &quot;thread_extent&quot; = 1;
        attr [IterVar(threadIdx.y_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.y&quot;)] &quot;thread_extent&quot; = 1;
        attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 16 {
          pad_temp.shared[(threadIdx.x_1*7)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 447)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 1)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 446)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 2)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 445)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 3)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 444)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 4)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 443)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 5)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 442)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 6)] = @tir.if_then_else((((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)) &amp;&amp; (((blockIdx.x*112) + (threadIdx.x_1*7)) &lt; 217)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 441)], 0f32, dtype=float32)
        }
        attr [IterVar(threadIdx.z_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.z&quot;)] &quot;thread_extent&quot; = 1;
        attr [IterVar(threadIdx.y_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.y&quot;)] &quot;thread_extent&quot; = 1;
        attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 16;
        if @tir.likely((threadIdx.x_2 &lt; 2), dtype=bool) {
          placeholder.shared[threadIdx.x_2] = (float32*)placeholder_4[(((((blockIdx.z*150) + (threadIdx.x_2*75)) + (rc.outer*25)) + (ry.outer*5)) + 3)]
        }
        compute_1[0] = ((float32*)compute_1[0] + ((float32*)pad_temp.shared[threadIdx.x]*(float32*)placeholder.shared[0]))
        compute_1[2] = ((float32*)compute_1[2] + ((float32*)pad_temp.shared[(threadIdx.x + 16)]*(float32*)placeholder.shared[0]))
        compute_1[4] = ((float32*)compute_1[4] + ((float32*)pad_temp.shared[(threadIdx.x + 32)]*(float32*)placeholder.shared[0]))
        compute_1[6] = ((float32*)compute_1[6] + ((float32*)pad_temp.shared[(threadIdx.x + 48)]*(float32*)placeholder.shared[0]))
        compute_1[8] = ((float32*)compute_1[8] + ((float32*)pad_temp.shared[(threadIdx.x + 64)]*(float32*)placeholder.shared[0]))
        compute_1[10] = ((float32*)compute_1[10] + ((float32*)pad_temp.shared[(threadIdx.x + 80)]*(float32*)placeholder.shared[0]))
        compute_1[12] = ((float32*)compute_1[12] + ((float32*)pad_temp.shared[(threadIdx.x + 96)]*(float32*)placeholder.shared[0]))
        compute_1[1] = ((float32*)compute_1[1] + ((float32*)pad_temp.shared[threadIdx.x]*(float32*)placeholder.shared[1]))
        compute_1[3] = ((float32*)compute_1[3] + ((float32*)pad_temp.shared[(threadIdx.x + 16)]*(float32*)placeholder.shared[1]))
        compute_1[5] = ((float32*)compute_1[5] + ((float32*)pad_temp.shared[(threadIdx.x + 32)]*(float32*)placeholder.shared[1]))
        compute_1[7] = ((float32*)compute_1[7] + ((float32*)pad_temp.shared[(threadIdx.x + 48)]*(float32*)placeholder.shared[1]))
        compute_1[9] = ((float32*)compute_1[9] + ((float32*)pad_temp.shared[(threadIdx.x + 64)]*(float32*)placeholder.shared[1]))
        compute_1[11] = ((float32*)compute_1[11] + ((float32*)pad_temp.shared[(threadIdx.x + 80)]*(float32*)placeholder.shared[1]))
        compute_1[13] = ((float32*)compute_1[13] + ((float32*)pad_temp.shared[(threadIdx.x + 96)]*(float32*)placeholder.shared[1]))
        attr [IterVar(threadIdx.z_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.z&quot;)] &quot;thread_extent&quot; = 1;
        attr [IterVar(threadIdx.y_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.y&quot;)] &quot;thread_extent&quot; = 1;
        attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 16 {
          pad_temp.shared[(threadIdx.x_1*7)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 446)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 1)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 445)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 2)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 444)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 3)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 443)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 4)] = @tir.if_then_else(((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 442)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 5)] = @tir.if_then_else((((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)) &amp;&amp; (((blockIdx.x*112) + (threadIdx.x_1*7)) &lt; 217)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 441)], 0f32, dtype=float32)
          pad_temp.shared[((threadIdx.x_1*7) + 6)] = @tir.if_then_else((((2 &lt;= (blockIdx.y + ry.outer)) &amp;&amp; ((blockIdx.y + ry.outer) &lt; 226)) &amp;&amp; (((blockIdx.x*112) + (threadIdx.x_1*7)) &lt; 216)), (float32*)placeholder_5[((((((rc.outer*50176) + (blockIdx.y*224)) + (ry.outer*224)) + (blockIdx.x*112)) + (threadIdx.x_1*7)) - 440)], 0f32, dtype=float32)
        }
        attr [IterVar(threadIdx.z_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.z&quot;)] &quot;thread_extent&quot; = 1;
        attr [IterVar(threadIdx.y_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.y&quot;)] &quot;thread_extent&quot; = 1;
        attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 16;
        if @tir.likely((threadIdx.x_2 &lt; 2), dtype=bool) {
          placeholder.shared[threadIdx.x_2] = (float32*)placeholder_4[(((((blockIdx.z*150) + (threadIdx.x_2*75)) + (rc.outer*25)) + (ry.outer*5)) + 4)]
        }
        compute_1[0] = ((float32*)compute_1[0] + ((float32*)pad_temp.shared[threadIdx.x]*(float32*)placeholder.shared[0]))
        compute_1[2] = ((float32*)compute_1[2] + ((float32*)pad_temp.shared[(threadIdx.x + 16)]*(float32*)placeholder.shared[0]))
        compute_1[4] = ((float32*)compute_1[4] + ((float32*)pad_temp.shared[(threadIdx.x + 32)]*(float32*)placeholder.shared[0]))
        compute_1[6] = ((float32*)compute_1[6] + ((float32*)pad_temp.shared[(threadIdx.x + 48)]*(float32*)placeholder.shared[0]))
        compute_1[8] = ((float32*)compute_1[8] + ((float32*)pad_temp.shared[(threadIdx.x + 64)]*(float32*)placeholder.shared[0]))
        compute_1[10] = ((float32*)compute_1[10] + ((float32*)pad_temp.shared[(threadIdx.x + 80)]*(float32*)placeholder.shared[0]))
        compute_1[12] = ((float32*)compute_1[12] + ((float32*)pad_temp.shared[(threadIdx.x + 96)]*(float32*)placeholder.shared[0]))
        compute_1[1] = ((float32*)compute_1[1] + ((float32*)pad_temp.shared[threadIdx.x]*(float32*)placeholder.shared[1]))
        compute_1[3] = ((float32*)compute_1[3] + ((float32*)pad_temp.shared[(threadIdx.x + 16)]*(float32*)placeholder.shared[1]))
        compute_1[5] = ((float32*)compute_1[5] + ((float32*)pad_temp.shared[(threadIdx.x + 32)]*(float32*)placeholder.shared[1]))
        compute_1[7] = ((float32*)compute_1[7] + ((float32*)pad_temp.shared[(threadIdx.x + 48)]*(float32*)placeholder.shared[1]))
        compute_1[9] = ((float32*)compute_1[9] + ((float32*)pad_temp.shared[(threadIdx.x + 64)]*(float32*)placeholder.shared[1]))
        compute_1[11] = ((float32*)compute_1[11] + ((float32*)pad_temp.shared[(threadIdx.x + 80)]*(float32*)placeholder.shared[1]))
        compute_1[13] = ((float32*)compute_1[13] + ((float32*)pad_temp.shared[(threadIdx.x + 96)]*(float32*)placeholder.shared[1]))
      }
    }
    compute[((((blockIdx.z*100352) + (blockIdx.y*224)) + (blockIdx.x*112)) + threadIdx.x)] = max((float32*)compute_1[0], 0f32)
    compute[(((((blockIdx.z*100352) + (blockIdx.y*224)) + (blockIdx.x*112)) + threadIdx.x) + 16)] = max((float32*)compute_1[2], 0f32)
    compute[(((((blockIdx.z*100352) + (blockIdx.y*224)) + (blockIdx.x*112)) + threadIdx.x) + 32)] = max((float32*)compute_1[4], 0f32)
    compute[(((((blockIdx.z*100352) + (blockIdx.y*224)) + (blockIdx.x*112)) + threadIdx.x) + 48)] = max((float32*)compute_1[6], 0f32)
    compute[(((((blockIdx.z*100352) + (blockIdx.y*224)) + (blockIdx.x*112)) + threadIdx.x) + 64)] = max((float32*)compute_1[8], 0f32)
    compute[(((((blockIdx.z*100352) + (blockIdx.y*224)) + (blockIdx.x*112)) + threadIdx.x) + 80)] = max((float32*)compute_1[10], 0f32)
    compute[(((((blockIdx.z*100352) + (blockIdx.y*224)) + (blockIdx.x*112)) + threadIdx.x) + 96)] = max((float32*)compute_1[12], 0f32)
    compute[(((((blockIdx.z*100352) + (blockIdx.y*224)) + (blockIdx.x*112)) + threadIdx.x) + 50176)] = max((float32*)compute_1[1], 0f32)
    compute[(((((blockIdx.z*100352) + (blockIdx.y*224)) + (blockIdx.x*112)) + threadIdx.x) + 50192)] = max((float32*)compute_1[3], 0f32)
    compute[(((((blockIdx.z*100352) + (blockIdx.y*224)) + (blockIdx.x*112)) + threadIdx.x) + 50208)] = max((float32*)compute_1[5], 0f32)
    compute[(((((blockIdx.z*100352) + (blockIdx.y*224)) + (blockIdx.x*112)) + threadIdx.x) + 50224)] = max((float32*)compute_1[7], 0f32)
    compute[(((((blockIdx.z*100352) + (blockIdx.y*224)) + (blockIdx.x*112)) + threadIdx.x) + 50240)] = max((float32*)compute_1[9], 0f32)
    compute[(((((blockIdx.z*100352) + (blockIdx.y*224)) + (blockIdx.x*112)) + threadIdx.x) + 50256)] = max((float32*)compute_1[11], 0f32)
    compute[(((((blockIdx.z*100352) + (blockIdx.y*224)) + (blockIdx.x*112)) + threadIdx.x) + 50272)] = max((float32*)compute_1[13], 0f32)
  }
}
</pre></div>
</div>
</div>
<div class="section" id="summary">
<h2>总结<a class="headerlink" href="#summary" title="永久链接至标题">¶</a></h2>
<p>In this tutorial, we have seen</p>
<ul class="simple">
<li><p>How to use TOPI API for common operations with numpy-style operators.</p></li>
<li><p>How TOPI facilitates generic schedules and operator fusion for a given context to generate optimized kernel code.</p></li>
</ul>
<div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-tutorial-intro-topi-py">
<div class="sphx-glr-download docutils container">
<p><a class="reference download internal" download="" href="../_downloads/3a9b1d387f618487c8ccf6b8b78ae179/intro_topi.py"><code class="xref download docutils literal notranslate"><span class="pre">Python</span> <span class="pre">源码下载:</span> <span class="pre">intro_topi.py</span></code></a></p>
</div>
<div class="sphx-glr-download docutils container">
<p><a class="reference download internal" download="" href="../_downloads/63f9e50204143ea3c2d3593c72439b3d/intro_topi.ipynb"><code class="xref download docutils literal notranslate"><span class="pre">Jupyter</span> <span class="pre">notebook</span> <span class="pre">下载:</span> <span class="pre">intro_topi.ipynb</span></code></a></p>
</div>
</div>
<p class="sphx-glr-signature"><a class="reference external" href="https://sphinx-gallery.github.io">Gallery generated by Sphinx-Gallery</a></p>
</div>
</div>


           </div>
           
          </div>
          

<footer>

    <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
      
        <a href="../how_to/index.html" class="btn btn-neutral float-right" title="How To Guides" accesskey="n" rel="next">下一个 <span class="fa fa-arrow-circle-right"></span></a>
      
      
        <a href="relay_quick_start.html" class="btn btn-neutral float-left" title="编译深度学习模型的快速开始教程" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> 上一个</a>
      
    </div>

<div id="button" class="backtop"><img src="../_static/img/right.svg" alt="Back to top"/> </div>
<section class="footerSec">
    <div class="footerHeader">
      <ul class="d-flex align-md-items-center justify-content-between flex-column flex-md-row">
        <li class="copywrite d-flex align-items-center">
          <h5 id="copy-right-info">© 2020 Apache Software Foundation | All rights reserved</h5>
        </li>
      </ul>

    </div>

    <ul>
      <li class="footernote">Copyright © 2020 The Apache Software Foundation. Apache TVM, Apache, the Apache feather, and the Apache TVM project logo are either trademarks or registered trademarks of the Apache Software Foundation.</li>
    </ul>

</section>
</footer>
        </div>
      </div>

    </section>

  </div>
  

    <script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.12.9/umd/popper.min.js" integrity="sha384-ApNbgh9B+Y1QKtv3Rn7W3mgPxhU9K/ScQsAP7hUibX39j7fakFPskvXusvfa0b4Q" crossorigin="anonymous"></script>
    <script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/js/bootstrap.min.js" integrity="sha384-JZR6Spejh4U02d8jOt6vLEHfe/JQGiRRSQQxSfFWpi1MquVdAyjUar5+76PVCmYl" crossorigin="anonymous"></script>

  <!--
    Theme navigation bootstrap. This script stays inside <body>; the single
    closing </body> tag for the document is the one just before </html>.
  -->
  <script>
      // jQuery(fn) defers the callback until the DOM is ready.
      jQuery(function () {
          // Turn on the sidebar navigation behaviour shipped with the theme.
          // NOTE(review): the `true` argument presumably enables sticky
          // navigation; confirm against _static/js/theme.js.
          SphinxRtdTheme.Navigation.enable(true);
      });
  </script>

  
  
    
    <!-- Theme Analytics -->
    <script>
    // Google Analytics bootstrap: installs a queueing stub under window.ga,
    // then injects the analytics.js loader ahead of the first <script> on the page.
    (function (win, doc, tagName, scriptUrl, queueName) {
      // Publish the name of the global command function.
      win['GoogleAnalyticsObject'] = queueName;

      // Until the library arrives, calls are buffered on the stub's `q` array
      // (presumably replayed by analytics.js once it loads).
      win[queueName] = win[queueName] || function () {
        (win[queueName].q = win[queueName].q || []).push(arguments);
      };

      // Numeric timestamp of when the stub was installed.
      win[queueName].l = 1 * new Date();

      // Create the loader element and insert it before the first existing one.
      var loader = doc.createElement(tagName);
      var firstScript = doc.getElementsByTagName(tagName)[0];
      loader.async = 1;
      loader.src = scriptUrl;
      firstScript.parentNode.insertBefore(loader, firstScript);
    })(window, document, 'script', 'https://www.google-analytics.com/analytics.js', 'ga');

    // Register the tracker for this property and record the page view.
    ga('create', 'UA-75982049-2', 'auto');
    ga('send', 'pageview');
    </script>

    
   

</body>
</html>